This file contains an example of tuning a Stacked Model with BayesSearchCV.
import pickle
import time
import helpsk as hlp
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
import plotly.io as pio
# Render plotly figures inline in the notebook.
pio.renderers.default='notebook'
def _load_pickle(path):
    """Load and return a single pickled object from `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)

# Training features/labels were pickled by an upstream notebook.
X_train = _load_pickle('../X_train.pkl')
y_train = _load_pickle('../y_train.pkl')
# Summary stats (nulls, zeros, distribution shape) for numeric columns.
hlp.pandas.numeric_summary(X_train)
| # of Non-Nulls | # of Nulls | % Nulls | # of Zeros | % Zeros | Mean | St Dev. | Coef of Var | Skewness | Kurtosis | Min | 10% | 25% | 50% | 75% | 90% | Max | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| duration | 760 | 40 | 5.0% | 0 | 0.0% | 21.0 | 11.7 | 0.6 | 1.0 | 0.6 | 4.0 | 9.0 | 12.0 | 18.0 | 24.0 | 36.0 | 60.0 |
| credit_amount | 800 | 0 | 0.0% | 38 | 5.0% | 3,203.9 | 2,932.3 | 0.9 | 1.9 | 3.9 | 0.0 | 753.9 | 1,300.8 | 2,236.5 | 3,951.5 | 7,394.6 | 18,424.0 |
| installment_commitment | 800 | 0 | 0.0% | 0 | 0.0% | 3.0 | 1.1 | 0.4 | -0.5 | -1.2 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 4.0 | 4.0 |
| residence_since | 800 | 0 | 0.0% | 0 | 0.0% | 2.9 | 1.1 | 0.4 | -0.3 | -1.4 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 4.0 | 4.0 |
| age | 800 | 0 | 0.0% | 0 | 0.0% | 35.6 | 11.4 | 0.3 | 1.0 | 0.7 | 19.0 | 23.0 | 27.0 | 33.0 | 42.0 | 52.0 | 75.0 |
| existing_credits | 800 | 0 | 0.0% | 0 | 0.0% | 1.4 | 0.6 | 0.4 | 1.3 | 1.6 | 1.0 | 1.0 | 1.0 | 1.0 | 2.0 | 2.0 | 4.0 |
| num_dependents | 800 | 0 | 0.0% | 0 | 0.0% | 1.1 | 0.3 | 0.3 | 2.0 | 2.1 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 2.0 | 2.0 |
# Summary (nulls, most frequent value, cardinality) for non-numeric columns.
hlp.pandas.non_numeric_summary(X_train)
| # of Non-Nulls | # of Nulls | % Nulls | Most Freq. Value | # of Unique | % Unique | |
|---|---|---|---|---|---|---|
| checking_status | 763 | 37 | 4.6% | no checking | 4 | 0.5% |
| credit_history | 800 | 0 | 0.0% | existing paid | 5 | 0.6% |
| purpose | 800 | 0 | 0.0% | radio/tv | 10 | 1.2% |
| savings_status | 800 | 0 | 0.0% | <100 | 5 | 0.6% |
| employment | 800 | 0 | 0.0% | 1<=X<4 | 5 | 0.6% |
| personal_status | 800 | 0 | 0.0% | male single | 4 | 0.5% |
| other_parties | 800 | 0 | 0.0% | none | 3 | 0.4% |
| property_magnitude | 800 | 0 | 0.0% | car | 4 | 0.5% |
| other_payment_plans | 800 | 0 | 0.0% | none | 3 | 0.4% |
| housing | 800 | 0 | 0.0% | own | 3 | 0.4% |
| job | 800 | 0 | 0.0% | skilled | 4 | 0.5% |
| own_telephone | 800 | 0 | 0.0% | none | 2 | 0.2% |
| foreign_worker | 800 | 0 | 0.0% | yes | 2 | 0.2% |
# Peek at the first ten target labels.
y_train[0:10]
array([1, 1, 0, 1, 0, 1, 0, 1, 1, 0])
# Class labels and their frequencies.
np.unique(y_train, return_counts=True)
(array([0, 1]), array([559, 241]))
# Class balance as proportions of the training set.
# Compute the unique counts once instead of calling np.unique twice.
_, class_counts = np.unique(y_train, return_counts=True)
class_counts / np.sum(class_counts)
array([0.69875, 0.30125])
from sklearn.preprocessing import OrdinalEncoder
# Quick look at what ordinal encoding of two categorical columns produces.
OrdinalEncoder().fit_transform(X_train[['purpose', 'savings_status']])
array([[0., 2.],
[2., 2.],
[9., 1.],
...,
[9., 3.],
[6., 4.],
[6., 2.]])
# Partition feature names by dtype so numeric and categorical columns can
# be transformed separately in the estimator pipelines below.
numeric_columns = hlp.pandas.get_numeric_columns(X_train)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(X_train)
for column_group in (numeric_columns, non_numeric_columns):
    print(column_group)
['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents'] ['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']
# Load the best hyper-parameters found by a previous Logistic Regression search.
file_name = '../Logistic Regression/Run 1 - Logistic Regression - BayesSearchCV.yaml'
log_results = hlp.sklearn_eval.MLExperimentResults.from_yaml_file(yaml_file_name = file_name)
log_results.best_params
{'model': 'LogisticRegression()',
'C': 0.1749996766322668,
'imputer': 'SimpleImputer()',
'scaler': 'StandardScaler()',
'encoder': 'OneHotEncoder()'}
# Load the best hyper-parameters found by a previous XGBoost search.
file_name = '../XGBoost/Run 1 - XGBoost - BayesSearchCV.yaml'
xgb_results = hlp.sklearn_eval.MLExperimentResults.from_yaml_file(yaml_file_name = file_name)
xgb_results.best_params
{'model': 'XGBClassifier()',
'max_depth': 8,
'learning_rate': 0.0036325173225203837,
'n_estimators': 1341,
'min_child_weight': 2,
'subsample': 0.7998768156287402,
'colsample_bytree': 0.8892648282704134,
'colsample_bylevel': 0.5213921375991398,
'reg_alpha': 0.39963322595869505,
'reg_lambda': 1.8262863809878243,
'imputer': "SimpleImputer(strategy='median')",
'scaler': 'None',
'encoder': 'OneHotEncoder()'}
# Keep only the XGBoost hyper-parameters; pipeline-level entries
# (model/imputer/scaler/encoder) are configured explicitly below.
_pipeline_keys = {'model', 'imputer', 'scaler', 'encoder'}
xgb_params = {name: value
              for name, value in xgb_results.best_params.items()
              if name not in _pipeline_keys}
xgb_params
{'max_depth': 8,
'learning_rate': 0.0036325173225203837,
'n_estimators': 1341,
'min_child_weight': 2,
'subsample': 0.7998768156287402,
'colsample_bytree': 0.8892648282704134,
'colsample_bylevel': 0.5213921375991398,
'reg_alpha': 0.39963322595869505,
'reg_lambda': 1.8262863809878243}
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
# Level-0 pipeline: mean-impute + standardize numerics, one-hot encode
# categoricals, then logistic regression using the previously tuned C.
logistic_estimator = Pipeline([
    ('preprocess', ColumnTransformer([
        (
            'numeric',
            Pipeline([
                ('imputer', SimpleImputer()),
                ('scaler', StandardScaler()),
            ]),
            numeric_columns,
        ),
        (
            'non_numeric',
            Pipeline([('encoder', OneHotEncoder())]),
            non_numeric_columns,
        ),
    ])),
    ('model', LogisticRegression(
        C=log_results.best_params['C'],
        solver='lbfgs',
        max_iter=1000,
        random_state=42,
    )),
])
# Level-0 pipeline: median-impute numerics (no scaling needed for trees),
# one-hot encode categoricals, then XGBoost with the previously tuned
# hyper-parameters.
xgb_estimator = Pipeline([
    ('preprocess', ColumnTransformer([
        (
            'numeric',
            Pipeline([('imputer', SimpleImputer(strategy='median'))]),
            numeric_columns,
        ),
        (
            'non_numeric',
            Pipeline([('encoder', OneHotEncoder())]),
            non_numeric_columns,
        ),
    ])),
    ('model', XGBClassifier(
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss',
        **xgb_params,
    )),
])
# Level-0 estimators whose out-of-fold predictions feed the final estimator.
estimators = [
    ('logistic', logistic_estimator),
    ('xgb', xgb_estimator),
]
from sklearn.ensemble import StackingClassifier
# The final (level-1) estimator is an XGBoost model; its hyper-parameters
# are what BayesSearchCV tunes below.
final_estimator = XGBClassifier(
    random_state=42,
    eval_metric='logloss',
    use_label_encoder=False,
)
stacking_model = StackingClassifier(estimators=estimators, final_estimator=final_estimator)
https://towardsdatascience.com/xgboost-fine-tune-and-optimize-your-model-23d996fab663
max_depth: 3–10 n_estimators: 100 (lots of observations) to 1000 (few observations) learning_rate: 0.01–0.3 colsample_bytree: 0.5–1 subsample: 0.6–1
Then, you can focus on optimizing max_depth and n_estimators. You can then play along with the learning_rate, and increase it to speed up the model without decreasing the performances. If it becomes faster without losing in performances, you can increase the number of estimators to try to increase the performances.
Find tuning options with:
bayes_search.get_params().keys()
Note that the param will be e.g. final_estimator__max_depth even though bayes_search.get_params().keys() returns estimator__final_estimator__max_depth
NOTE: i ran into an issue when using passthrough=True (via BayesSearchCV, which isn't the problem), because of an issue related to https://github.com/scikit-learn/scikit-learn/issues/16473
I get
ValueError: Specifying the columns using strings is only supported for pandas DataFrames
Which is probably caused by the same issue in the link with the error:
make_column_selector can only be applied to pandas dataframes
The github issue says that While everything is going [good] if X is only numerical, things start to be complicated when we are dealing with mixed types and dataframe.
Thus I removed the non_numeric transformations and added `, remainder="drop"` so that it only passes through the non-numeric data.
This didn't work, probably because the issue is before the transformation. Otherwise the OneHotEncoding would have worked because all columns are numeric afterward.
# Reuse the default XGBoost search space from ClassifierSearchSpace, but
# drop entries for pipeline steps that don't exist in the stacking model
# and re-target `model__` parameters at `final_estimator__`.
_dropped_keys = (
    'model',
    'prep__numeric__imputer__transformer',
    'prep__numeric__scaler__transformer',
    'prep__non_numeric__encoder__transformer',
)
xgb_search_space = {
    key.replace('model__', 'final_estimator__'): space
    for key, space in hlp.sklearn_search.ClassifierSearchSpace._search_space_xgboost().items()
    if key not in _dropped_keys
}
xgb_search_space
{'final_estimator__max_depth': Integer(low=1, high=50, prior='log-uniform', transform='identity'),
'final_estimator__learning_rate': Real(low=0.0001, high=0.5, prior='uniform', transform='identity'),
'final_estimator__n_estimators': Integer(low=100, high=2000, prior='log-uniform', transform='identity'),
'final_estimator__min_child_weight': Integer(low=1, high=50, prior='log-uniform', transform='identity'),
'final_estimator__subsample': Real(low=0.5, high=1, prior='uniform', transform='identity'),
'final_estimator__colsample_bytree': Real(low=0.5, high=1, prior='uniform', transform='identity'),
'final_estimator__colsample_bylevel': Real(low=0.5, high=1, prior='uniform', transform='identity'),
'final_estimator__reg_alpha': Real(low=0.0001, high=1, prior='log-uniform', transform='identity'),
'final_estimator__reg_lambda': Real(low=1, high=4, prior='log-uniform', transform='identity')}
# pip install scikit-optimize
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.model_selection import RepeatedKFold
# search_space = {
# 'final_estimator__max_depth': Integer(3, 10),
# 'final_estimator__n_estimators': Integer(50, 2000),
# 'final_estimator__learning_rate': Real(0.01, 0.3),
# 'final_estimator__colsample_bytree': Real(0.01, 1),
# 'final_estimator__subsample': Real(0.5, 1),
# }
# Tune only the stacking model's final estimator with Bayesian optimization,
# scored by cross-validated ROC-AUC.
bayes_search = BayesSearchCV(
    estimator=stacking_model,
    search_spaces=xgb_search_space,
    n_iter=50,  # number of parameter settings sampled
    cv=RepeatedKFold(n_splits=5, n_repeats=2),  # 10 fits per candidate
    scoring='roc_auc',
    # return_train_score=True,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
# Run the search and report wall-clock time.
start_time = time.time()
bayes_search.fit(X_train, y_train)
elapsed_time = time.time() - start_time
# NOTE: removed `del search_space` — the `search_space` dict above is
# commented out, so deleting it raised NameError.
# find tuning options with:
# bayes_search.get_params().keys()
print(f"Elapsed time to run BayesSearchCV: {elapsed_time:.3f} seconds; {elapsed_time / 60:.1f} minutes")
Elapsed time to run BayesSearchCV: 1277.584 seconds; 21.3 minutes
print(bayes_search.best_score_)
0.7718679525244111
print(bayes_search.best_params_)
OrderedDict([('final_estimator__colsample_bylevel', 0.5), ('final_estimator__colsample_bytree', 1.0), ('final_estimator__learning_rate', 0.0001), ('final_estimator__max_depth', 33), ('final_estimator__min_child_weight', 11), ('final_estimator__n_estimators', 100), ('final_estimator__reg_alpha', 0.0001), ('final_estimator__reg_lambda', 1.8504332622406525), ('final_estimator__subsample', 0.6980397351680853)])
# Strip the 'final_estimator__' prefix so reported parameter names match
# the plain XGBoost hyper-parameter names.
new_param_column_names = {
    parameter: parameter.replace('final_estimator__', '')
    for parameter in xgb_search_space
}
new_param_column_names
{'final_estimator__max_depth': 'max_depth',
'final_estimator__learning_rate': 'learning_rate',
'final_estimator__n_estimators': 'n_estimators',
'final_estimator__min_child_weight': 'min_child_weight',
'final_estimator__subsample': 'subsample',
'final_estimator__colsample_bytree': 'colsample_bytree',
'final_estimator__colsample_bylevel': 'colsample_bylevel',
'final_estimator__reg_alpha': 'reg_alpha',
'final_estimator__reg_lambda': 'reg_lambda'}
# Wrap the fitted search in MLExperimentResults, persist to YAML, then
# round-trip load to verify the file.
results = hlp.sklearn_eval.MLExperimentResults.from_sklearn_search_cv(
    searcher=bayes_search,
    higher_score_is_better = True,
    parameter_name_mappings = new_param_column_names
)
yaml_file = 'Run 1 - Stacking - BayesSearchCV.yaml'
results.to_yaml_file(yaml_file_name = yaml_file)
results = hlp.sklearn_eval.MLExperimentResults.from_yaml_file(yaml_file_name = yaml_file)
# Best mean CV ROC-AUC from the reloaded results (matches bayes_search.best_score_).
results.best_score
0.7718679525244111
# Best final-estimator hyper-parameters (prefix-stripped names).
results.best_params
{'max_depth': 33,
'learning_rate': 0.0001,
'n_estimators': 100,
'min_child_weight': 11,
'subsample': 0.6980397351680853,
'colsample_bytree': 1.0,
'colsample_bylevel': 0.5,
'reg_alpha': 0.0001,
'reg_lambda': 1.8504332622406525}
# One row per trial: mean score with 95% CI plus the sampled parameters.
results.to_formatted_dataframe()
| roc_auc Mean | roc_auc 95CI.LO | roc_auc 95CI.HI | max_depth | learning_rate | n_estimators | min_child_weight | subsample | colsample_bytree | colsample_bylevel | reg_alpha | reg_lambda |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.772 | 0.743 | 0.801 | 33 | 0.000 | 100 | 11 | 0.698 | 1.000 | 0.500 | 0.000 | 1.850 |
| 0.769 | 0.758 | 0.781 | 47 | 0.000 | 2,000 | 9 | 0.611 | 1.000 | 0.511 | 0.000 | 1.000 |
| 0.769 | 0.751 | 0.788 | 10 | 0.000 | 100 | 17 | 0.752 | 1.000 | 0.930 | 0.000 | 1.455 |
| 0.769 | 0.744 | 0.794 | 32 | 0.000 | 100 | 20 | 0.790 | 0.673 | 0.560 | 0.000 | 1.159 |
| 0.767 | 0.749 | 0.785 | 4 | 0.000 | 100 | 10 | 0.514 | 0.500 | 0.917 | 0.000 | 1.203 |
| 0.766 | 0.748 | 0.784 | 3 | 0.000 | 151 | 1 | 0.825 | 0.500 | 0.810 | 0.411 | 1.707 |
| 0.765 | 0.733 | 0.797 | 12 | 0.000 | 386 | 1 | 0.668 | 0.500 | 0.500 | 0.547 | 4.000 |
| 0.764 | 0.743 | 0.785 | 50 | 0.000 | 100 | 13 | 0.500 | 0.500 | 0.500 | 0.086 | 1.731 |
| 0.764 | 0.729 | 0.798 | 3 | 0.000 | 412 | 1 | 0.998 | 0.500 | 0.602 | 0.320 | 2.997 |
| 0.764 | 0.745 | 0.783 | 8 | 0.038 | 100 | 19 | 0.818 | 0.846 | 0.859 | 0.000 | 1.872 |
| 0.761 | 0.736 | 0.786 | 17 | 0.000 | 100 | 1 | 0.500 | 0.920 | 0.695 | 1.000 | 2.307 |
| 0.760 | 0.739 | 0.780 | 2 | 0.000 | 310 | 1 | 1.000 | 0.541 | 1.000 | 0.001 | 1.074 |
| 0.759 | 0.744 | 0.775 | 50 | 0.000 | 466 | 7 | 0.849 | 0.845 | 0.525 | 0.242 | 1.312 |
| 0.759 | 0.736 | 0.782 | 2 | 0.000 | 107 | 22 | 0.996 | 0.876 | 0.981 | 0.028 | 1.634 |
| 0.758 | 0.732 | 0.785 | 36 | 0.064 | 407 | 25 | 0.735 | 0.500 | 0.588 | 0.018 | 2.606 |
| 0.758 | 0.726 | 0.789 | 50 | 0.000 | 2,000 | 6 | 0.779 | 0.852 | 0.681 | 0.000 | 1.559 |
| 0.758 | 0.730 | 0.785 | 50 | 0.000 | 100 | 21 | 1.000 | 0.500 | 0.557 | 1.000 | 3.223 |
| 0.757 | 0.739 | 0.774 | 3 | 0.017 | 103 | 1 | 0.517 | 0.628 | 0.500 | 0.006 | 3.143 |
| 0.756 | 0.737 | 0.775 | 2 | 0.082 | 305 | 23 | 0.893 | 0.970 | 0.867 | 0.007 | 2.099 |
| 0.756 | 0.730 | 0.781 | 7 | 0.000 | 100 | 1 | 0.895 | 0.997 | 0.948 | 1.000 | 2.294 |
| 0.756 | 0.737 | 0.774 | 12 | 0.093 | 101 | 27 | 0.788 | 1.000 | 0.510 | 0.087 | 2.739 |
| 0.754 | 0.731 | 0.777 | 43 | 0.070 | 244 | 30 | 0.656 | 0.500 | 0.882 | 0.000 | 2.103 |
| 0.752 | 0.735 | 0.769 | 1 | 0.311 | 112 | 1 | 0.619 | 0.500 | 0.984 | 0.053 | 4.000 |
| 0.749 | 0.728 | 0.771 | 1 | 0.033 | 1,677 | 27 | 0.690 | 0.500 | 0.859 | 0.700 | 4.000 |
| 0.748 | 0.720 | 0.776 | 41 | 0.152 | 121 | 29 | 0.818 | 0.942 | 0.919 | 0.000 | 1.633 |
| 0.747 | 0.723 | 0.771 | 50 | 0.000 | 100 | 1 | 0.500 | 1.000 | 0.500 | 0.000 | 1.000 |
| 0.746 | 0.713 | 0.779 | 7 | 0.112 | 214 | 22 | 0.501 | 1.000 | 0.980 | 0.001 | 2.266 |
| 0.745 | 0.724 | 0.766 | 1 | 0.130 | 153 | 50 | 1.000 | 0.860 | 1.000 | 0.099 | 2.853 |
| 0.741 | 0.721 | 0.761 | 1 | 0.035 | 2,000 | 21 | 0.842 | 1.000 | 0.894 | 1.000 | 1.109 |
| 0.740 | 0.707 | 0.772 | 12 | 0.093 | 277 | 11 | 0.797 | 1.000 | 0.500 | 0.017 | 1.058 |
| 0.737 | 0.703 | 0.772 | 4 | 0.042 | 925 | 15 | 0.703 | 0.500 | 0.569 | 0.435 | 4.000 |
| 0.735 | 0.715 | 0.754 | 21 | 0.500 | 407 | 50 | 0.899 | 0.523 | 0.762 | 0.001 | 1.049 |
| 0.734 | 0.713 | 0.756 | 5 | 0.000 | 387 | 50 | 0.822 | 1.000 | 0.519 | 0.000 | 2.066 |
| 0.733 | 0.713 | 0.754 | 3 | 0.393 | 797 | 50 | 0.767 | 0.886 | 0.784 | 0.002 | 4.000 |
| 0.725 | 0.688 | 0.762 | 1 | 0.055 | 134 | 50 | 0.760 | 0.812 | 0.961 | 0.000 | 1.013 |
| 0.724 | 0.697 | 0.751 | 5 | 0.053 | 389 | 2 | 0.779 | 0.959 | 0.722 | 0.000 | 2.835 |
| 0.723 | 0.698 | 0.748 | 16 | 0.263 | 857 | 34 | 0.677 | 0.719 | 0.900 | 0.005 | 2.463 |
| 0.722 | 0.699 | 0.745 | 23 | 0.299 | 133 | 8 | 0.956 | 0.586 | 0.906 | 0.105 | 3.353 |
| 0.721 | 0.694 | 0.749 | 1 | 0.354 | 1,441 | 1 | 0.861 | 0.500 | 0.870 | 0.000 | 3.853 |
| 0.721 | 0.697 | 0.744 | 31 | 0.180 | 594 | 10 | 0.867 | 0.888 | 0.809 | 0.039 | 1.795 |
| 0.719 | 0.688 | 0.749 | 2 | 0.183 | 1,661 | 50 | 0.691 | 0.500 | 0.500 | 0.005 | 3.859 |
| 0.714 | 0.695 | 0.733 | 1 | 0.000 | 319 | 27 | 1.000 | 1.000 | 0.948 | 1.000 | 3.658 |
| 0.709 | 0.684 | 0.734 | 3 | 0.466 | 346 | 14 | 0.652 | 0.864 | 0.705 | 0.003 | 2.788 |
| 0.707 | 0.667 | 0.746 | 3 | 0.371 | 228 | 10 | 0.947 | 0.909 | 0.502 | 0.002 | 1.236 |
| 0.700 | 0.667 | 0.733 | 26 | 0.248 | 117 | 4 | 0.693 | 0.960 | 0.772 | 0.019 | 1.036 |
| 0.697 | 0.662 | 0.731 | 5 | 0.436 | 188 | 4 | 0.797 | 0.850 | 0.978 | 0.000 | 1.672 |
| 0.695 | 0.665 | 0.725 | 20 | 0.256 | 503 | 1 | 0.857 | 1.000 | 0.500 | 0.257 | 2.503 |
| 0.692 | 0.664 | 0.719 | 1 | 0.000 | 100 | 4 | 1.000 | 1.000 | 1.000 | 0.002 | 4.000 |
| 0.690 | 0.661 | 0.719 | 50 | 0.095 | 1,618 | 1 | 0.500 | 0.500 | 0.500 | 0.024 | 4.000 |
| 0.682 | 0.659 | 0.704 | 23 | 0.500 | 657 | 1 | 0.788 | 0.500 | 0.901 | 0.553 | 3.954 |
# Visualize the search: score/parameter values across trials, pairwise
# relationships, and parallel coordinates.
results.plot_performance_across_trials().show()
results.plot_performance_across_trials(size='learning_rate', color='max_depth').show()
results.plot_parameter_values_across_trials().show()
results.plot_scatter_matrix(height=1000, width=1000 * hlp.plot.GOLDEN_RATIO).show()
results.plot_performance_numeric_params(height=800)
results.plot_parallel_coordinates().show()
No non-numeric Variables
# Score vs. learning_rate; point size encodes colsample_bytree.
results.plot_score_vs_parameter(
    parameter='learning_rate',
    size='colsample_bytree',
)
# Build a dataframe with the mean-score column plus one column per
# hyper-parameter, for use in the regression analysis below.
# (Fixed: stray cell-output text "roc_auc Mean¶" was fused onto the
# start of the first statement.)
score_variable = results.primary_score_name + ' Mean'
score_dataframe = results.to_dataframe()
score_dataframe = score_dataframe.drop(columns=[x for x in score_dataframe.columns
                                                if x not in [score_variable] + results.parameter_names])
score_dataframe.head()
| roc_auc Mean | max_depth | learning_rate | n_estimators | min_child_weight | subsample | colsample_bytree | colsample_bylevel | reg_alpha | reg_lambda | |
|---|---|---|---|---|---|---|---|---|---|---|
| 43 | 0.771868 | 33 | 0.0001 | 100 | 11 | 0.698040 | 1.000000 | 0.500000 | 0.0001 | 1.850433 |
| 48 | 0.769437 | 47 | 0.0001 | 2000 | 9 | 0.610901 | 1.000000 | 0.510724 | 0.0001 | 1.000000 |
| 40 | 0.769298 | 10 | 0.0001 | 100 | 17 | 0.752268 | 1.000000 | 0.930188 | 0.0001 | 1.454899 |
| 41 | 0.769285 | 32 | 0.0001 | 100 | 20 | 0.790169 | 0.672919 | 0.560346 | 0.0001 | 1.158592 |
| 44 | 0.766914 | 4 | 0.0001 | 100 | 10 | 0.514002 | 0.500000 | 0.916846 | 0.0001 | 1.202716 |
# Make column names formula-safe: spaces become underscores and any
# character that is not alphanumeric or an underscore is dropped.
original_names = score_dataframe.columns.tolist()
cleaned = [
    ''.join(ch for ch in name.replace(' ', '_') if ch == '_' or ch.isalnum())
    for name in original_names
]
cleaned_column_names = dict(zip(original_names, cleaned))
cleaned_column_names
{'roc_auc Mean': 'roc_auc_Mean',
'max_depth': 'max_depth',
'learning_rate': 'learning_rate',
'n_estimators': 'n_estimators',
'min_child_weight': 'min_child_weight',
'subsample': 'subsample',
'colsample_bytree': 'colsample_bytree',
'colsample_bylevel': 'colsample_bylevel',
'reg_alpha': 'reg_alpha',
'reg_lambda': 'reg_lambda'}
score_dataframe = score_dataframe.rename(columns=cleaned_column_names)
import statsmodels.formula.api as smf

# Regress the mean CV score on the hyper-parameters; the OLS coefficients
# give a rough sense of which parameters drive performance.
y_column = 'roc_auc_Mean'
predictor_columns = [c for c in score_dataframe.columns if c != y_column]
formula = f"{y_column} ~ {hlp.string.collapse(predictor_columns, separate=' + ', surround='')}"
print(formula)
results = smf.ols(formula=formula, data=score_dataframe).fit()
print(results.summary())
roc_auc_Mean ~ max_depth + learning_rate + n_estimators + min_child_weight + subsample + colsample_bytree + colsample_bylevel + reg_alpha + reg_lambda
OLS Regression Results
==============================================================================
Dep. Variable: roc_auc_Mean R-squared: 0.544
Model: OLS Adj. R-squared: 0.441
Method: Least Squares F-statistic: 5.293
Date: Sun, 13 Feb 2022 Prob (F-statistic): 9.36e-05
Time: 11:57:58 Log-Likelihood: 133.81
No. Observations: 50 AIC: -247.6
Df Residuals: 40 BIC: -228.5
Df Model: 9
Covariance Type: nonrobust
=====================================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------------
Intercept 0.8097 0.024 33.196 0.000 0.760 0.859
max_depth -6.3e-05 0.000 -0.347 0.731 -0.000 0.000
learning_rate -0.0970 0.019 -5.150 0.000 -0.135 -0.059
n_estimators -5.456e-06 4.95e-06 -1.103 0.277 -1.55e-05 4.55e-06
min_child_weight 0.0002 0.000 1.123 0.268 -0.000 0.001
subsample -0.0163 0.019 -0.856 0.397 -0.055 0.022
colsample_bytree -0.0341 0.014 -2.457 0.018 -0.062 -0.006
colsample_bylevel -0.0027 0.016 -0.167 0.869 -0.035 0.030
reg_alpha 0.0031 0.009 0.333 0.741 -0.016 0.022
reg_lambda -0.0073 0.003 -2.373 0.023 -0.014 -0.001
==============================================================================
Omnibus: 3.127 Durbin-Watson: 1.345
Prob(Omnibus): 0.209 Jarque-Bera (JB): 2.437
Skew: -0.537 Prob(JB): 0.296
Kurtosis: 3.130 Cond. No. 8.06e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 8.06e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import pandas as pd

scaler = StandardScaler()
# Standardize every numeric column so the OLS coefficients become directly
# comparable in magnitude; non-numeric columns (none here) pass through.
numeric_columns = hlp.pandas.get_numeric_columns(score_dataframe)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(score_dataframe)
print(numeric_columns)
print(non_numeric_columns)
transformations_pipeline = ColumnTransformer([
    ('numeric_pipeline', Pipeline([('scaling', StandardScaler())]), numeric_columns),
    ('non_numeric_pipeline', 'passthrough', non_numeric_columns),
])
score_dataframe_transformed = pd.DataFrame(
    transformations_pipeline.fit_transform(score_dataframe),
    columns=numeric_columns + non_numeric_columns,
)
score_dataframe_transformed.head()
['roc_auc_Mean', 'max_depth', 'learning_rate', 'n_estimators', 'min_child_weight', 'subsample', 'colsample_bytree', 'colsample_bylevel', 'reg_alpha', 'reg_lambda'] []
| roc_auc_Mean | max_depth | learning_rate | n_estimators | min_child_weight | subsample | colsample_bytree | colsample_bylevel | reg_alpha | reg_lambda | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.285180 | 0.936112 | -0.772045 | -0.688637 | -0.338184 | -0.483009 | 1.099423 | -1.287650 | -0.569644 | -0.487683 |
| 1 | 1.186565 | 1.739150 | -0.772045 | 2.692792 | -0.465800 | -1.058052 | 1.099423 | -1.230902 | -0.569644 | -1.297975 |
| 2 | 1.180935 | -0.383164 | -0.772045 | -0.688637 | 0.044666 | -0.125152 | 1.099423 | 0.988652 | -0.569644 | -0.864548 |
| 3 | 1.180408 | 0.878752 | -0.772045 | -0.688637 | 0.236091 | 0.124967 | -0.451276 | -0.968335 | -0.569644 | -1.146869 |
| 4 | 1.084221 | -0.727323 | -0.772045 | -0.688637 | -0.401992 | -1.697498 | -1.271092 | 0.918056 | -0.569644 | -1.104828 |
# The ColumnTransformer output comes back as an object-dtype array;
# cast every column back to float in one loop instead of ten copy-pasted
# astype statements.
for _column in ['roc_auc_Mean', 'max_depth', 'learning_rate', 'n_estimators',
                'min_child_weight', 'subsample', 'colsample_bytree',
                'colsample_bylevel', 'reg_alpha', 'reg_lambda']:
    score_dataframe_transformed[_column] = score_dataframe_transformed[_column].astype('float')
print(formula)
# Same OLS as above but on standardized inputs, so coefficient magnitudes
# are directly comparable across hyper-parameters.
results = smf.ols(formula=formula, data=score_dataframe_transformed).fit()
print(results.summary())
roc_auc_Mean ~ max_depth + learning_rate + n_estimators + min_child_weight + subsample + colsample_bytree + colsample_bylevel + reg_alpha + reg_lambda
OLS Regression Results
==============================================================================
Dep. Variable: roc_auc_Mean R-squared: 0.544
Model: OLS Adj. R-squared: 0.441
Method: Least Squares F-statistic: 5.293
Date: Sun, 13 Feb 2022 Prob (F-statistic): 9.36e-05
Time: 11:58:16 Log-Likelihood: -51.338
No. Observations: 50 AIC: 122.7
Df Residuals: 40 BIC: 141.8
Df Model: 9
Covariance Type: nonrobust
=====================================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------------
Intercept 1.749e-15 0.107 1.64e-14 1.000 -0.216 0.216
max_depth -0.0446 0.128 -0.347 0.731 -0.304 0.215
learning_rate -0.6038 0.117 -5.150 0.000 -0.841 -0.367
n_estimators -0.1244 0.113 -1.103 0.277 -0.352 0.104
min_child_weight 0.1253 0.112 1.123 0.268 -0.100 0.351
subsample -0.1004 0.117 -0.856 0.397 -0.337 0.137
colsample_bytree -0.2919 0.119 -2.457 0.018 -0.532 -0.052
colsample_bylevel -0.0207 0.124 -0.167 0.869 -0.271 0.230
reg_alpha 0.0402 0.121 0.333 0.741 -0.203 0.284
reg_lambda -0.3105 0.131 -2.373 0.023 -0.575 -0.046
==============================================================================
Omnibus: 3.127 Durbin-Watson: 1.345
Prob(Omnibus): 0.209 Jarque-Bera (JB): 2.437
Skew: -0.537 Prob(JB): 0.296
Kurtosis: 3.130 Cond. No. 2.21
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Collect standardized coefficients and p-values into one table.
coefficients = pd.DataFrame({
    'feature': results.params.index,
    'coefficient': results.params,
    'p_value': results.pvalues,
})
# .copy() so the column assignment below operates on an independent frame
# rather than a query() view (avoids pandas' SettingWithCopyWarning).
coefficients = coefficients.query("feature != 'Intercept'").copy()
coefficients['Stat Sig'] = coefficients['p_value'] <= 0.05
coefficients
| feature | coefficient | p_value | Stat Sig | |
|---|---|---|---|---|
| max_depth | max_depth | -0.044554 | 0.730598 | False |
| learning_rate | learning_rate | -0.603812 | 0.000007 | True |
| n_estimators | n_estimators | -0.124366 | 0.276806 | False |
| min_child_weight | min_child_weight | 0.125306 | 0.268114 | False |
| subsample | subsample | -0.100389 | 0.397130 | False |
| colsample_bytree | colsample_bytree | -0.291943 | 0.018425 | True |
| colsample_bylevel | colsample_bylevel | -0.020670 | 0.868524 | False |
| reg_alpha | reg_alpha | 0.040172 | 0.740719 | False |
| reg_lambda | reg_lambda | -0.310491 | 0.022535 | True |
# The original (un-cleaned) score column name; reused in the plot title below.
score_variable
'roc_auc Mean'
# Bar chart of the standardized coefficients, ordered by absolute
# magnitude; color marks statistical significance (p <= 0.05).
px.bar(
    data_frame=coefficients.reindex(coefficients['coefficient'].abs().sort_values(ascending=True).index),
    y='feature',
    x='coefficient',
    color='Stat Sig',
    title=f"Regression Coefficients of Hyper-parameters against '{score_variable}'",
    height=600,
    width=600*hlp.plot.GOLDEN_RATIO
)
from sklearn.inspection import permutation_importance

# Permutation importance on the refit best stacking model: shuffle each
# feature in turn (10 repeats) and measure the resulting score drop.
estimator = bayes_search.best_estimator_
start_time = time.time()
result = permutation_importance(
    estimator, X_train, y_train, n_repeats=10, random_state=42, n_jobs=2
)
elapsed_time = time.time() - start_time
print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")
feature_names = X_train.columns.to_list()
forest_importances = (
    pd.Series(result.importances_mean, index=feature_names)
    .sort_values(ascending=False)
)
Elapsed time to compute the importances: 8.666 seconds
import matplotlib.pyplot as plt

# Bar chart of the permutation importances with std-dev error bars.
fig, ax = plt.subplots(figsize=(9, 6))
forest_importances.plot.bar(yerr=result.importances_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.tight_layout()
plt.show()
# Default rate by foreign_worker status.
temp = X_train.copy()
temp['default'] = y_train
# Use the string alias 'mean' rather than np.mean: passing the numpy
# function to .agg is deprecated in recent pandas (emits FutureWarning).
temp.groupby('foreign_worker').agg({'default': 'mean'})
| default | |
|---|---|
| foreign_worker | |
| yes | 0.308290 |
| no | 0.107143 |
# Box plot of age split by default status — permutation importance ranked
# age highly, so eyeball that relationship.
fig = px.box(
    data_frame=temp,
    y='age',
    x='default',
    height=600,
    width=600 * hlp.plot.GOLDEN_RATIO,
)
fig.show()
NOTE: foreign worker seems like it should be important but is ranked last in feature importance.